QC of scRNA-seq

Author

Mohamed Mabrouk

Published

August 16, 2024

Code
"in the following cell, override the default pipeline parameters if needed"

# QC pipeline parameters — override the defaults here if needed.
CONCAT_SAMPLES: bool = True  # Concatenate all samples in one object, default: true
CORRECT_AMBIENT_RNA: bool = (
    False  # Correct ambient RNA using DecontX. Currently causes multiple errors.
)
FILTER_DOUBLETS: bool = False  # Filter doublets using Scrublet
CELL_CYCLE_SCORE: bool = (
    True  # Calculate cell cycle scores, based on the scanpy implementation.
)

# TODO: Move to normalization
# VARS_TO_REGRESS: List[str] = []     # variables to regress out (pct_counts_mt, pct_counts_ribo).
# REGRESS: bool = False               # Regress out unwanted variables. Not recommended.


############################################################
####################     QC Dict      ######################
############################################################

"""
Can either be a flat-dict for global threshold or a dict of dicts for each sample 
Entries in the form of col_name: [low, high] Ex: 'pct_counts_mt':[0, 20] which will be used as (low, high) pair.
Also entries can be a single number Ex: 'pct_counts_mt': 3 which be used as number of MADS as follows media -+ nmad * MAD. 
"""

# by default, filtering on read counts, number of features, and mitochondrial content
qc_dict: Dict[str, List] | Dict[str, Number] = {
    "n_genes_by_counts": 5,
    "total_counts": 5,
    "pct_counts_mt": 3,
}

# MODIFY_ME: Specify only after running diagnostics on the samples
# global config
# qc_dict: Dict[str, List] | Dict[str, List] = {
#     "pct_counts_mt": [0, 10],
#     "pct_counts_rb": [0, 10],
#     "n_genes_by_counts": [1500, 9000],
#     "total_counts": [3_000, 40_000]
# }


# # per-sample config
# qc_dict: Dict[str, Dict[str, List]] = {
#     "sample1": {
#         "pct_counts_mt": [0, 10],
#         "pct_counts_rb": [0, 10],
#         "total_counts": [3_000, 40_000]
#     },
#     "sample2": {
#         "pct_counts_rb": [0, 10],
#         "n_genes_by_counts": [1500, 9000],
#         "total_counts": [3_000, 40_000]
#     },
#     "sample3": {
#         "pct_counts_mt": [0, 10],
#         "n_genes_by_counts": [1500, 9000],
#         "total_counts": [3_000, 40_000]
#     },
#     "sample4": {
#         "pct_counts_mt": [0, 10],
#         "pct_counts_rb": [0, 10],
#         "n_genes_by_counts": [1500, 9000],
#     }
# }

Samples

Code
itables.show(pd.DataFrame(samples, index=["sample path"]).T)
sample path
Loading ITables v2.1.3 from the internet... (need help?)

Diagnostic Plots (prior to filtering)

Basic QC plots & metrics

Code
# One violin plot per QC metric, grouped by sample, on a 2-column grid.
ncols = 2
nrows = len(keys) // ncols + len(keys) % ncols  # ceiling division for a 2-column grid

figsize = 3
wspace = 0.5
hspace = 0.5
# Scale the canvas with the grid, accounting for inter-plot spacing.
fig, axs = plt.subplots(
    nrows=nrows,
    ncols=ncols,
    figsize=(
        ncols * figsize + figsize * wspace * (ncols - 1),
        nrows * figsize + hspace * (nrows - 1),
    ),
)
plt.subplots_adjust(wspace=wspace, hspace=hspace)

axs = axs.flatten()

# Prevent the subplots from showing implicitly; the figure is displayed explicitly below.
plt.close(fig)

for i, key in enumerate(keys):
    sc.pl.violin(
        adata, keys=[key], groupby="sample", stripplot=False, inner="box", ax=axs[i]
    )
    # tick_params avoids the FixedLocator warning that set_xticklabels() triggers.
    axs[i].tick_params(axis="x", labelrotation=30)

# Drop unused grid cells before computing the layout.
if len(keys) < nrows * ncols:
    for i in range(len(keys), nrows * ncols):
        fig.delaxes(axs[i])

# BUG FIX: the original called plt.tight_layout() after plt.close(fig), which
# laid out a fresh, empty current figure (the stray "<Figure ... with 0 Axes>"
# in the output). Lay out the intended figure instead.
fig.tight_layout()

display(fig)

<Figure size 400x400 with 0 Axes>

Table of basic QC metrics

Code
df1 = adata.obs.groupby("sample")[keys].agg(["mean", "median"]).round(3)
df2 = adata.obs.groupby("sample")[["sample"]].agg(["size"])
itables.show(pd.concat([df1, df2], axis=1))
n_genes_by_counts total_counts pct_counts_mt pct_counts_rb S_score G2M_score sample
mean median mean median mean median mean median mean median mean median size
sample
Loading ITables v2.1.3 from the internet... (need help?)

Histograms

Interactive

Static

Code
import ipywidgets as widgets
from IPython.display import Markdown, display
import io
import base64
import holoviews as hv


def _hist_grid_to_b64(frame, columns, nrows, ncols, fig_size, spacing):
    """Plot one histogram per column on an nrows x ncols grid and return the
    figure as a base64-encoded PNG string (for embedding in markdown)."""
    fig, axes = plt.subplots(
        nrows=nrows,
        ncols=ncols,
        figsize=(
            ncols * fig_size + fig_size * spacing * (ncols - 1),
            nrows * fig_size + spacing * (nrows - 1),
        ),
    )
    plt.subplots_adjust(wspace=spacing, hspace=spacing)
    axes = axes.flatten()

    for i, col in enumerate(columns):
        sns.histplot(data=frame, x=col, ax=axes[i], bins=300)

    # Remove any unused subplots.
    for i in range(len(columns), nrows * ncols):
        fig.delaxes(axes[i])

    plt.tight_layout()
    buf = io.BytesIO()
    plt.savefig(buf, format="png", bbox_inches="tight")
    plt.close()
    buf.seek(0)
    return base64.b64encode(buf.read()).decode("utf-8")


n = len(keys)
ncols = 2
nrows = n // ncols + (n % ncols > 0)  # ceiling division

quarto_markdown = "::: {.panel-tabset}\n\n"

wspace = 0.5
# BUG FIX: the original relied on a stale global `df` left over from an earlier
# cell for the "All samples" panel; define it explicitly here (this mirrors the
# post-filtering histogram cell, which does define it).
df = adata.obs[keys]
img_str = _hist_grid_to_b64(df, keys, nrows, ncols, figsize, wspace)
quarto_markdown += (
    f"### All samples \n\n![All samples](data:image/png;base64,{img_str})\n\n"
)

# One tab per sample, in sorted order.
samples = np.sort(adata.obs["sample"].unique())
for sample in samples:
    df = adata.obs.loc[adata.obs["sample"] == sample, keys]
    img_str = _hist_grid_to_b64(df, keys, nrows, ncols, figsize, wspace)
    quarto_markdown += (
        f"### {sample} \n\n![{sample}](data:image/png;base64,{img_str})\n\n"
    )

quarto_markdown += ":::\n"
# Display the generated markdown
display(Markdown(quarto_markdown))

All samples

sample1

sample2

sample3

sample4

Scatter plots of confounders

DecontX contamination

Effects of confounders

Filtering

Filtering thresholds

n_genes_by_counts total_counts pct_counts_mt
sample
min sample4 45 500.0 0.459418
sample3 58 500.0 0.458716
sample2 53 500.0 0.462107
sample1 103 500.0 0.453858
max sample4 10568 44201.0 8.502772
sample3 10507 44195.0 8.498425
sample2 9476 43567.0 8.504811
sample1 10149 44147.0 8.499831

Number of outliers based on provided criteria

n_genes_by_counts_outlier total_counts_outlier pct_counts_mt_outlier aggregate outliers Total Cells
sample
Loading ITables v2.1.3 from the internet... (need help?)

Cell filtering based on outlier function

QC plots (post filtering)

Table of basic QC metrics

n_genes_by_counts total_counts pct_counts_mt pct_counts_rb S_score G2M_score sample
mean median mean median mean median mean median mean median mean median size
sample
sample4 4646.962 4966.0 18362.970703 17684.0 4.352 4.243 4.875 4.634 -0.003 -0.191 -0.003 -0.191 6123
sample3 4377.408 4643.5 16685.796875 15659.0 4.079 3.977 5.448 5.258 -0.003 -0.204 -0.003 -0.204 7114
sample2 3402.320 3461.0 9729.103516 8889.5 4.913 4.880 5.756 5.686 -0.023 -0.184 -0.023 -0.184 9248
sample1 4128.167 4280.0 13573.825195 12769.0 4.452 4.351 5.951 5.862 -0.005 -0.197 -0.005 -0.197 8602

Histograms

Interactive

Static

Code
import ipywidgets as widgets
from IPython.display import Markdown, display
import io
import base64
import holoviews as hv


def _hist_grid_to_b64(frame, columns, nrows, ncols, fig_size, spacing):
    """Plot one histogram per column on an nrows x ncols grid and return the
    figure as a base64-encoded PNG string (for embedding in markdown)."""
    fig, axes = plt.subplots(
        nrows=nrows,
        ncols=ncols,
        figsize=(
            ncols * fig_size + fig_size * spacing * (ncols - 1),
            nrows * fig_size + spacing * (nrows - 1),
        ),
    )
    plt.subplots_adjust(wspace=spacing, hspace=spacing)
    axes = axes.flatten()

    for i, col in enumerate(columns):
        sns.histplot(data=frame, x=col, ax=axes[i], bins=300)

    # Remove any unused subplots.
    for i in range(len(columns), nrows * ncols):
        fig.delaxes(axes[i])

    plt.tight_layout()
    buf = io.BytesIO()
    plt.savefig(buf, format="png", bbox_inches="tight")
    plt.close()
    buf.seek(0)
    return base64.b64encode(buf.read()).decode("utf-8")


# Post-filtering QC metrics for all cells.
df = adata.obs[keys]

n = len(keys)
ncols = 2
nrows = n // ncols + (n % ncols > 0)  # ceiling division

quarto_markdown = "::: {.panel-tabset}\n\n"

wspace = 0.5
img_str = _hist_grid_to_b64(df, keys, nrows, ncols, figsize, wspace)
quarto_markdown += (
    f"### All samples \n\n![All samples](data:image/png;base64,{img_str})\n\n"
)

# One tab per sample, in sorted order.
samples = np.sort(adata.obs["sample"].unique())
for sample in samples:
    df = adata.obs.loc[adata.obs["sample"] == sample, keys]
    img_str = _hist_grid_to_b64(df, keys, nrows, ncols, figsize, wspace)
    quarto_markdown += (
        f"### {sample} \n\n![{sample}](data:image/png;base64,{img_str})\n\n"
    )

quarto_markdown += ":::\n"
# Display the generated markdown
display(Markdown(quarto_markdown))

All samples

sample1

sample2

sample3

sample4

Scatter plots of confounders after filtering

DecontX contamination

Code
# Per-cluster distribution of DecontX contamination estimates (10x data only).
if CORRECT_AMBIENT_RNA and TECHNOLOGY == "10x":
    # Widen the figure in proportion to the number of DecontX clusters.
    n_clusters = adata.obs["decontX_clusters"].astype("int32").max()
    with rc_context({"figure.figsize": (n_clusters * 0.7, 5)}):
        sns.violinplot(adata.obs, x="decontX_clusters", y="decontX_contamination")
        sns.stripplot(
            adata.obs, x="decontX_clusters", y="decontX_contamination", s=1, c="black"
        )

Clustering after cell filtering

Code
# #| echo: false
# #| output: false
# #| warning: false

# ## Regression of Variables

# # - [ ] Add error handling if the list of vars to regress is empty or contains non-key entries
# if REGRESS:
#     sc.pp.regress_out(adata, keys= VARS_TO_REGRESS)

Session Information

Code
session_info.show()
Click to view session information
-----
anndata             0.10.8
holoviews           1.19.0
hvplot              0.10.0
ipywidgets          8.1.3
itables             2.1.3
matplotlib          3.8.4
numpy               1.26.4
pandas              2.2.2
panel               1.4.4
patchworklib        0.6.3
requests            2.32.3
scanpy              1.10.1
scipy               1.14.0
seaborn             0.13.2
session_info        1.0.0
tomlkit             0.12.5
utils               NA
-----
Click to view modules imported as dependencies
PIL                         10.3.0
anyio                       NA
argcomplete                 NA
arrow                       1.3.0
asttokens                   NA
attr                        23.2.0
attrs                       23.2.0
babel                       2.14.0
bleach                      6.1.0
bokeh                       3.4.2
brotli                      1.1.0
bs4                         4.12.3
certifi                     2024.06.02
cffi                        1.16.0
charset_normalizer          3.3.2
colorama                    0.4.6
colorcet                    3.1.0
comm                        0.2.2
cycler                      0.12.1
cython_runtime              NA
dateutil                    2.9.0
debugpy                     1.8.2
decorator                   5.1.1
defusedxml                  0.7.1
dill                        0.3.8
executing                   2.0.1
fastjsonschema              NA
fqdn                        NA
h5py                        3.11.0
htmlparser                  NA
idna                        3.7
igraph                      0.11.5
ipykernel                   6.29.4
isoduration                 NA
jedi                        0.19.1
jinja2                      3.1.4
joblib                      1.4.2
json5                       0.9.25
jsonpointer                 3.0.0
jsonschema                  4.22.0
jsonschema_specifications   NA
jupyter_events              0.10.0
jupyter_server              2.14.1
jupyterlab_pygments         0.3.0
jupyterlab_server           2.27.2
kaleido                     NA
kiwisolver                  1.4.5
legacy_api_wrap             NA
leidenalg                   0.10.2
llvmlite                    0.43.0
markdown                    3.6
markupsafe                  2.1.5
matplotlib_inline           0.1.7
mistune                     3.0.2
mizani                      0.11.4
mpl_toolkits                NA
natsort                     8.4.0
nbclient                    0.10.0
nbconvert                   7.16.4
nbformat                    5.10.4
numba                       0.60.0
overrides                   NA
packaging                   24.1
pandocfilters               NA
param                       2.1.1
parso                       0.8.4
patsy                       0.5.6
pickleshare                 0.7.5
platformdirs                4.2.2
plotly                      5.22.0
plotnine                    0.13.6
prometheus_client           NA
prompt_toolkit              3.0.47
psutil                      6.0.0
pure_eval                   0.2.2
pycparser                   2.22
pydev_ipython               NA
pydevconsole                NA
pydevd                      2.9.5
pydevd_file_utils           NA
pydevd_plugins              NA
pydevd_tracing              NA
pygments                    2.18.0
pynndescent                 0.5.13
pyparsing                   3.1.2
pythonjsonlogger            NA
pytz                        2024.1
pyviz_comms                 3.0.2
referencing                 NA
rfc3339_validator           0.1.4
rfc3986_validator           0.1.1
rpds                        NA
send2trash                  NA
setuptools                  70.1.1
six                         1.16.0
sklearn                     1.5.0
sniffio                     1.3.1
socks                       1.7.1
soupsieve                   2.5
stack_data                  0.6.2
statsmodels                 0.14.2
tenacity                    NA
texttable                   1.7.0
threadpoolctl               3.5.0
tinycss2                    1.3.0
tornado                     6.4.1
tqdm                        4.66.5
traitlets                   5.14.3
typing_extensions           NA
umap                        0.5.5
uri_template                NA
urllib3                     2.2.2
wcwidth                     0.2.13
webcolors                   24.6.0
webencodings                0.5.1
websocket                   1.8.0
xyzservices                 2024.6.0
yaml                        6.0.1
zmq                         26.0.3
zstandard                   0.22.0
-----
IPython             8.25.0
jupyter_client      8.6.2
jupyter_core        5.7.2
jupyterlab          4.2.2
-----
Python 3.11.9 | packaged by conda-forge | (main, Apr 19 2024, 18:36:13) [GCC 12.3.0]
Linux-6.8.0-40-generic-x86_64-with-glibc2.39
-----
Session information updated at 2024-08-16 23:27